Topics Covered in this notebook
Authors: Pulkit Gupta, Advait Save
Date: 08-October-2018
# Inject a small JavaScript snippet into the rendered notebook that adds a
# button toggling visibility of all code cells (useful for report-style sharing).
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Core data handling and visualization libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
# Model selection and evaluation utilities
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn import metrics
from sklearn.metrics import (accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc,
mean_squared_error, log_loss, precision_recall_curve, classification_report,
precision_recall_fscore_support)
# Classifier algorithms compared in this notebook
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost.sklearn import XGBClassifier
import scikitplot as skplt
# Model interpretability (local surrogate explanations)
import lime
import lime.lime_tabular
# NOTE: sklearn.externals.six was removed in scikit-learn 0.23; on Python 3,
# six.StringIO was simply an alias for io.StringIO, so import it directly.
from io import StringIO
from IPython.display import Image
import pydotplus
import os
#Add Graphviz path to environment variable PATH, used for visualizing decision trees
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
#Import supporting functions
from calculate_model_metrics import lift_plot_model, plot_roc, plot_grid_search, evaluate_model
import warnings
# Suppress warnings to keep the rendered notebook output readable
warnings.filterwarnings("ignore")
# Load the churn dataset and take a first look at shape and contents
data = pd.read_csv("../data/churn.csv")
print(data.shape)
data.head(5)
data.describe()
# Area Code is a categorical identifier, not a numeric quantity
data['Area Code'] = data['Area Code'].astype('object')
data.describe(include=['O'])
# Overall churn rate: share of rows where 'Churn?' is not 'False.'
print("Churn Rate: {} %".format(round(sum(np.where(data['Churn?'] == 'False.',0,1))*100.0/len(data['Churn?']),2)))
target_variable = ["Churn?"]  # column(s) to predict
id_variable = ["Phone"]  # customer identifier, excluded from features
Observations:
# Correlation heatmap across numeric features, annotated with coefficients
sns.heatmap(data.corr(),annot=True,cmap='RdYlGn',linewidths=0.2) #data.corr()-->correlation matrix
fig=plt.gcf()
fig.set_size_inches(20,16)
plt.show()
#Dropping fields that are correlated to other fields
# (per the heatmap above, each *Charge field tracks its corresponding *Mins field)
data.drop(columns = ['Day Charge','Eve Charge','Night Charge','Intl Charge'],inplace=True)
# Missing-value audit: count and percentage of NaNs per column
total = data.isnull().sum().sort_values(ascending = False)
percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
missing_train_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_train_data
Pair plots (Bi-variate scatter plots) can be used similar to correlation plots, to observe correlations and also compare patterns in the target variable
Here we plot the pair plots, with the target variable "Churn" in hue i.e. The Blue and Red points represent the two values of Churn - yes/no.
The diagonal shows the distribution of the two target classes against each variable.
Parameters used:
Observations:
# Pair plot: scatter of every numeric feature pair, colored by churn class.
# seaborn renamed `size` to `height` in 0.9; using `height` avoids the
# deprecation warning / TypeError on current seaborn versions.
pp = sns.pairplot(data, hue = 'Churn?', palette = 'deep', height=2.5, diag_kind = 'kde', diag_kws=dict(shade=True), plot_kws=dict(s=10) )
# Tick labels are too dense to read at this subplot size
pp.set(xticklabels=[])
plt.show()
# Separate features, target, and identifier columns
X = data.drop(target_variable + id_variable,axis=1)
y = data[target_variable]
ids = data[id_variable]
Since some Classifier algorithms do not accept categorical fields, they need to be transformed
There are two methods for transforming categorical fields:
1. Label encoding - each unique category value is assigned an integer value
2. One Hot encoding - the integer encoded variable is removed and a new binary variable is added for each unique integer value
The first one is not preferred since the classification algorithms tend to assume a natural ordering between categories, which may result in poor performance or unexpected results
# One-hot encode categorical fields; drop_first avoids the dummy-variable trap
X_with_dummy_features = pd.get_dummies(X,drop_first=True)
# Target becomes a single 0/1 indicator column
y_with_dummy_features = pd.get_dummies(y,drop_first=True)
A better way to test a model is to use a hold-out set which doesn't enter the training. This operation can be performed using scikit-learn's train/test split utility. Stratified sampling is a probability sampling technique: divide the entire population into different subgroups or strata, then randomly select the final subjects proportionally from the different strata.
# 50/50 train/test split, stratified on the target so both sets keep the same churn rate
xtrain, xtest, ytrain, ytest , id_train, id_test = train_test_split \
(X_with_dummy_features, y_with_dummy_features,ids,test_size=0.5, \
stratify=y_with_dummy_features,random_state=1000)
# Sanity check: counts and ratios of positive cases should match across the splits
print ("No. of True Cases in training data set for" , ytrain.values.ravel().sum())
print ("No. of True Cases in testing data set for",ytest.values.ravel().sum())
print ("Ratio of True Cases in training data set: " , round(ytrain.values.ravel().sum()/len(ytrain.values.ravel()),2))
print ("Ratio of True Cases in testing data set: ", round(ytest.values.ravel().sum()/len(ytest.values.ravel()),2))
Logistic regression is an easy to interpret and computationally inexpensive classification algorithm. It uses the natural logarithm function to find the relationship between the variables and uses test data to find the coefficients.
#initialize model performance comparison dataframe
eval_results_compare = pd.DataFrame()
model_name = 'Logistic Regression'
logreg = LogisticRegression()
# .ravel() flattens the single-column target frame to the 1-D array sklearn expects
trained_model_lr = logreg.fit(xtrain, ytrain.values.ravel())
print ("Trained model :: ", trained_model_lr)
# Class probabilities; column 1 is the positive (churn) class
prob_test = trained_model_lr.predict_proba(xtest)
prob_train = trained_model_lr.predict_proba(xtrain)
#Probability Threshold = 0.5 (default)
pred_test = trained_model_lr.predict(xtest)
pred_train = trained_model_lr.predict(xtrain)
AUC Score (Area under the Curve)
#Calculate AUC from positive-class probabilities (threshold-independent metric)
auc_score = roc_auc_score(ytest, prob_test[:,1])
print ("\n\nModel ROC-AUC score for validation sample: %.3f" \
% auc_score)
Train and test Accuracy
#Calculate train and test accuracy (comparing the two helps spot over-fitting)
train_acc = accuracy_score(ytrain.values.ravel(), pred_train)
test_acc = accuracy_score(ytest.values.ravel(), pred_test)
print ("\nTrain Accuracy :: ", train_acc)
print ("\nTest Accuracy :: ", test_acc)
Confusion matrix
print ("\n Confusion matrix: \n")
# scikit-plot renders the confusion matrix as an annotated heatmap
skplt.metrics.plot_confusion_matrix(ytest.values.ravel(), pred_test, title="Confusion Matrix",
text_fontsize='large')
plt.show()
# Colors https://matplotlib.org/examples/color/colormaps_reference.html
Precision, recall, F1-score
Recall: Proportion of actual positives (TP + FN) that were identified correctly (TP) --> TP / (TP + FN)
Precision: Proportion of positive identifications (TP + FP) that were actually correct (TP) --> TP / (TP + FP)
F1 score: single metric that combines recall and precision using the harmonic mean
#Calculate classification model evaluation metrics like precision, recall, f1 score
report = classification_report(ytest, pred_test)
# 'weighted' averages per-class scores weighted by class support
precision,recall,fscore,support = precision_recall_fscore_support(ytest,pred_test,average='weighted')
print("\n Classification report (weighted average across classes) ::\n", classification_report(ytest, pred_test))
#Obtain the precision and recall for the corresponding variations in threshold
p, r, thresholds = precision_recall_curve(ytest, prob_test[:,1])
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall of the positive class against the decision
    threshold.

    Adapted from "Hands-On Machine Learning with Scikit-Learn".
    precision_recall_curve returns one more precision/recall value than
    thresholds, hence the trailing element is trimmed before plotting.
    """
    # Trim the extra score point so both series align with `thresholds`
    prec_by_thresh = precisions[:-1]
    rec_by_thresh = recalls[:-1]
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores (Positive class) vs. Decision threshold")
    plt.plot(thresholds, prec_by_thresh, "b--", label="Precision")
    plt.plot(thresholds, rec_by_thresh, "g-", label="Recall")
    plt.xlabel("Decision Threshold")
    plt.ylabel("Score")
    plt.legend(loc='best')
# Visualize the precision/recall trade-off across decision thresholds
plot_precision_recall_vs_threshold(p, r, thresholds)
plt.show()
#Calculate key performance metrics for model validation
eval_result_LR = evaluate_model(model_name, trained_model_lr, xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_LR)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare
Simple probabilistic classifier based on applying Bayes' theorem with strong (naive) independence assumptions between the features
model_name = 'Gaussian Naive Bayes'
gnb = GaussianNB()
# .ravel() flattens the single-column target frame to the 1-D array sklearn expects
trained_model_gnb = gnb.fit(xtrain, ytrain.values.ravel())
#Calculate key performance metrics for model validation
eval_result_GNB = evaluate_model(model_name, trained_model_gnb, xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_GNB)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare
eval_results_compare.reset_index(drop = True)
The Decision Tree algorithm is used to create a training model which can be used to predict the class or value of the target variable by learning decision rules inferred from prior (training) data.
model_name = 'Decision Tree'
dt=DecisionTreeClassifier()
trained_model_dt = dt.fit(xtrain,ytrain)
# Export the fitted tree to Graphviz DOT format and render its top 3 levels inline
dot_data = StringIO()
export_graphviz(trained_model_dt, out_file=dot_data,
feature_names = xtrain.columns,
max_depth = 3,
class_names = ['0','1'],
rounded = True, proportion = False,
precision = 2, filled = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
#Calculate key performance metrics for model validation
eval_result_DT = evaluate_model(model_name, trained_model_dt, xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_DT)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare.reset_index(drop = True)
Random Forest is a flexible, easy to use machine learning algorithm that produces a great result most of the time. Random forest builds multiple decision trees and merges them together to get a more accurate and stable prediction.
# Create random forest classifier model to predict churn
model_name = 'Random Forest'
clf = RandomForestClassifier()
trained_model_rf = clf.fit(xtrain, ytrain.values.ravel())
# save the model to disk (demonstrates model persistence with pickle);
# `with` closes each file handle promptly — the original bare open() calls
# leaked the handles
filename = '../models/baseline_randomforest_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(trained_model_rf, model_file)
# some time later...
# load the model from disk
with open(filename, 'rb') as model_file:
    trained_model_rf = pickle.load(model_file)
#Calculate key performance metrics for model validation
eval_result_RF = evaluate_model(model_name, trained_model_rf, xtrain, xtest, ytrain, ytest, verbose = False)
#Append model results to a comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_RF)
eval_results_compare.reset_index(drop = True)
Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees.
model_name = 'Gradient Boosting Classifier'
grb = GradientBoostingClassifier()
# .ravel() flattens the single-column target frame to the 1-D array sklearn expects
trained_model_grb = grb.fit(xtrain,ytrain.values.ravel())
#Calculate key performance metrics for model validation
eval_result_GRB = evaluate_model(model_name, trained_model_grb, xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_GRB)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare.reset_index(drop = True)
XGBoost is an implementation of gradient boosted decision trees designed for speed and performance. It is a scalable and accurate implementation of gradient boosting machines and it has proven to push the limits of computing power for boosted trees algorithms as it was built and developed for the sole purpose of model performance and computational speed
Parameters used:
eval_metric = 'error': Evaluation metrics for validation data, error = (# wrong cases) / (# all cases)
Benefits - Fast, efficient, scalable and flexible
model_name = 'XGBoost'
# binary:logistic outputs churn probabilities; 'error' = misclassification rate
xgb = XGBClassifier(objective = 'binary:logistic', eval_metric="error")
trained_model_xgb = xgb.fit(xtrain,ytrain.values.ravel())
#Calculate key performance metrics for model validation
eval_result_XGB = evaluate_model(model_name, trained_model_xgb, xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_XGB)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare.reset_index(drop = True)
GridSearchCV takes a scoring parameter used to choose the best performing model. The various metrics that can be used for scoring along with their usage scenarios:
Note: The Accuracy Paradox for Predictive Analytics states that predictive models with a given level of accuracy may have greater predictive power than models with higher accuracy, especially in the case of imbalanced classes. A model might be predicting the majority class with high accuracy resulting in a high accuracy score, but it performs poorly on the other class.
In this case, we use recall as the scoring parameter since the objective is to correctly predict as many churners as possible. We are not concerned about False positives since there is not much of a loss in wrongly predicting a Churner.
List of tuning parameters:
# Create range of values for parameter (number of trees: 1, 9, 17, ... < 1000)
param_range = np.arange(1, 1000, 8)
# Calculate accuracy on training and test set using range of parameter values
# (3-fold cross-validation for each candidate n_estimators value)
train_scores, test_scores = validation_curve(RandomForestClassifier(),
xtrain,
ytrain,
param_name="n_estimators",
param_range=param_range,
cv=3,
scoring="accuracy",
n_jobs=4,
verbose = 2)
# Calculate mean and standard deviation for training set scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
# Calculate mean and standard deviation for test set scores
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
# Plot mean accuracy scores for training and test sets
plt.plot(param_range, train_mean, label="Training score", color="black")
plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")
# Plot accuracy bands (mean +/- one standard deviation) for training and test sets
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color="gray")
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color="gainsboro")
# Create plot
plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("CV Accuracy")
plt.tight_layout()
plt.legend(loc="best")
plt.show()
# Create the parameter grid based on the results of random search
model_name = 'Random Forest - Hp tuned'
param_grid = {
'max_depth': [50, 100, 150, 200, 250, 300],
'n_estimators': [50, 100, 200, 300, 400, 500]
}
rf = RandomForestClassifier()
# 'recall' scoring: the goal is to catch as many churners as possible, and
# false positives are cheap here (see the discussion above)
tuning_rf = GridSearchCV(estimator = rf, param_grid = param_grid, scoring = 'recall',
cv = 3, n_jobs = 4, verbose = 2)
tuning_rf.fit(xtrain, ytrain)
# Visualize mean CV score for every (n_estimators, max_depth) combination
plot_grid_search(tuning_rf.cv_results_, param_grid['n_estimators'], param_grid['max_depth'], 'N Estimators', 'Max Depth')
print('Grid search results:\nBest parameters: {}, Best mean cross-validated score: {}'.format(tuning_rf.best_params_, tuning_rf.best_score_))
#Calculate key performance metrics for model validation
eval_result_GRB_tuned = evaluate_model(model_name, tuning_rf.best_estimator_ , xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_GRB_tuned)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare.reset_index(drop = True)
Get the predicted probabilities from the best Random Forest classifier
# Use the best estimator found by the grid search for threshold tuning below
trained_model = tuning_rf.best_estimator_
prob_test = trained_model.predict_proba(xtest)
prob_train = trained_model.predict_proba(xtrain)
Distribution of predicted probabilities of Random Forest classifier
sns.distplot(prob_test[:,1])
Check Precision-Recall vs Decision Threshold Plot to set the optimal threshold
# Precision/recall trade-off across thresholds for the tuned model
p, r, thresholds = precision_recall_curve(ytest, prob_test[:,1])
plot_precision_recall_vs_threshold(p, r, thresholds)
# Lower the decision threshold from the default 0.5 to trade precision for recall
threshold = 0.3
predicted = (prob_test [:,1] >= threshold).astype('int')
print("\n Classification report (weighted average across classes) ::\n", classification_report(ytest, predicted))
#Calculate key performance metrics for model validation
model_name = 'Random Forest - Hp, threshold tuned'
eval_result_GRB_tuned = evaluate_model(model_name, tuning_rf.best_estimator_ , xtrain, xtest, ytrain, ytest, verbose = False, threshold = 0.3)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_GRB_tuned)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare.reset_index(drop = True)
Ensembling is a good way to increase the accuracy or performance of a model. In simple words, it is the combination of various simple models to create a single powerful model.
Ensembling can be done in ways like:
1) Voting Classifier
2) Bagging
3) Boosting
model_name = 'Ensemble - XGB and Gradient Boosting'
xgb = XGBClassifier(objective = 'binary:logistic', eval_metric="error",
nthreads=2)
grb = GradientBoostingClassifier()
# Soft voting averages the two models' predicted probabilities
model = VotingClassifier(estimators=[('xgb', xgb), ('GBM', grb)], voting='soft')
trained_model_ensemble = model.fit(xtrain,ytrain)
#Calculate key performance metrics for model validation
eval_result_ensemble = evaluate_model(model_name, trained_model_ensemble , xtrain, xtest, ytrain, ytest, verbose = False)
#Append key performance metrics to comparison dataframe
eval_results_compare = eval_results_compare.append(eval_result_ensemble)
eval_results_compare.reset_index(drop=True,inplace=True)
eval_results_compare.reset_index(drop = True)
# Cast all metric columns (everything except the model name) to float so they
# can be rounded and color-graded
cols = list(set(eval_results_compare.columns) - set(['Model']))
eval_results_compare[cols] = eval_results_compare[cols].astype(float)
num_cols = eval_results_compare.select_dtypes(float).columns
eval_results_compare[num_cols] = round(eval_results_compare[num_cols],4)
#Define colormap
cm = sns.light_palette("green", as_cmap=True)
# Rank models by test accuracy; green gradient highlights better scores
eval_results_compare.sort_values(by='Test Accuracy',ascending=False).reset_index(drop=True).style.background_gradient(cmap = cm, high = 0.5, low = -0.5, axis = 0)
#Define all models (display label + fitted estimator) for the combined ROC comparison below
models = [
{
'label': 'Logistic Regression',
'model': trained_model_lr,
},
{
'label': 'Gaussian Naive Bayes',
'model': trained_model_gnb,
},
{
'label': 'Decision Tree',
'model': trained_model_dt,
},
{
'label': 'Random Forest',
'model': trained_model_rf,
},
{
'label': 'Gradient Boosting Classifier',
'model': trained_model_grb,
},
{
'label': 'XGBoost',
'model': trained_model_xgb,
},
{
'label': 'Random Forest - tuned',
'model': tuning_rf.best_estimator_,
}
,
{
'label': 'Ensemble - XGB and Gradient Boosting',
'model': trained_model_ensemble
}
]
#Below for loop iterates through the models list and plots the ROC curve for each
plt.figure(figsize=(9,9))
for m in models:
    model = m['model'] # select the model
    # ROC analysis needs positive-class probabilities, not hard 0/1 labels:
    # the original computed the AUC label from predict(), which understates AUC
    # and disagrees with the plotted curve. Also removed an unused predict()
    # call on the training data.
    prob_pos = model.predict_proba(xtest)[:,1]
    # Compute False postive rate, and True positive rate
    fpr, tpr, thresholds = metrics.roc_curve(ytest, prob_pos)
    # Calculate Area under the curve to display on the plot
    # (named auc_val to avoid shadowing sklearn.metrics.auc imported above)
    auc_val = metrics.roc_auc_score(ytest, prob_pos)
    # Now, plot the computed values
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc_val))
# Custom settings for the plot
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show() # Display
#Function to plot model vs performance metric bar chart
def plot_model_eval(data,metric):
    """Render a horizontal bar chart of models ranked by the given metric."""
    # Rank worst-to-best so the best model ends up at the top of the chart
    ranked = data.sort_values(by = metric)
    ranked[['Model', metric]].sort_values(by = metric).plot(kind='barh', alpha=0.55)
    plt.xlabel(metric, fontsize=15)
    plt.ylabel('Model', fontsize=15)
    # Replace the numeric y positions with the model names
    plt.yticks(range(0,len(ranked.Model)), ranked.Model, fontsize=10)
# Compare models on recall for the positive (churn) class...
metric = 'Recall_1'
plot_model_eval(eval_results_compare,metric)
# ...and on overall test accuracy
metric = 'Test Accuracy'
plot_model_eval(eval_results_compare,metric)
# Gradient boosting feature importances, sorted descending
var_imp_gbc = pd.DataFrame({'Feature':xtrain.columns,'Var Imp':trained_model_grb.feature_importances_}).sort_values(by='Var Imp',ascending = False).reset_index(drop=True)
print("Top 15 - Gradient Boosting Feature Importance")
var_imp_gbc.head(15).style.background_gradient(cmap = cm, high = 0.6, low = 0.3, axis = 0)
# XGBoost feature importances, sorted descending
var_imp_xgb = pd.DataFrame({'Feature':xtrain.columns,'Var Imp':trained_model_xgb.feature_importances_}).sort_values(by='Var Imp',ascending = False).reset_index(drop=True)
print("Top 15 - XGB Feature Importance")
var_imp_xgb.head(15).style.background_gradient(cmap = cm, high = 0.6, low = 0.3, axis = 0)
print('Variable Importance - Combined')
# Join the two importance tables on feature name for side-by-side comparison
var_imp_combined = var_imp_xgb.merge(var_imp_gbc, on = 'Feature', suffixes = ['_xgb','_gbc'])
var_imp_combined.head(15).style.background_gradient(cmap = cm, high = 0.6, low = 0.3, axis = 0)
# Keep the top features as ranked by XGBoost importance
number_imp_features = 10
imp_features = var_imp_combined.sort_values(by = 'Var Imp_xgb', ascending = False).Feature[:number_imp_features]
print('List of significant variables:')
list(imp_features)
# For each feature, compare its mean value within churners vs non-churners to
# infer the direction of its relationship with churn
target = y_with_dummy_features.columns[0]
data_dummy_coded = X_with_dummy_features.join(y_with_dummy_features)
all_feature_vs_target = data_dummy_coded.groupby([target]).mean().T.reset_index().rename(columns={'index':'Feature',0:'Target False Avg',1:'Target True Avg'})
all_feature_vs_target['Direction vs Target'] = np.where(all_feature_vs_target['Target True Avg']>all_feature_vs_target['Target False Avg'],'Positive','Negative')
all_feature_vs_target.head(5)
# Combine importances with direction, restricted to the significant features
result = var_imp_combined.merge(all_feature_vs_target, on = 'Feature')
result_imp = result[result.Feature.isin(list(imp_features))]
result_imp.style.background_gradient(cmap = cm, high = 0.6, low = 0.3, axis = 0, subset=pd.IndexSlice[:, ['Var Imp_xgb', 'Var Imp_gbc']])
result_imp[['Feature', 'Var Imp_xgb', 'Var Imp_gbc']].sort_values(by = 'Var Imp_xgb').plot(kind='barh', alpha=0.5)
plt.ylabel('Model', fontsize=15)
plt.xlabel('Var Imp', fontsize=15)
plt.yticks(range(0,len(result_imp.Feature)), result_imp.Feature, fontsize=10)
plt.show()
Local interpretable model-agnostic explanations (LIME) allow us to explain individual predictions for "black box" models by creating local, interpretable, surrogate models
LIME can be used in the following ways:
LIME general approach:
# Score the test set with logistic regression and locate its false positives
model_pred = trained_model_lr.predict(xtest)
# Extract False Positives predictions (predicted churn = 1, actually stayed)
wrong_pred = (model_pred != ytest.values.ravel() ) & (model_pred == 1)
#Modify "i" to select row
i = 20
wrong = xtest.iloc[wrong_pred].iloc[i]
wrong_act = ytest.iloc[wrong_pred].iloc[i][0]
print('False Positive:\nPrediction: {}'.format(model_pred[wrong_pred][i]))
print('Actual Value: {}'.format(wrong_act))
# Create a lime explainer object
# BUGFIX: training_labels must correspond to training_data (xtrain), so pass
# ytrain rather than ytest — the two only happened to have equal length here
# because of the 50/50 split, and the labels were misaligned with the rows.
explainer_lr = lime.lime_tabular.LimeTabularExplainer(training_data = xtrain.values,
                                                      mode = 'classification',
                                                      training_labels = ytrain.values.ravel(),
                                                      feature_names = xtrain.columns)
# Explanation for wrong prediction
exp_lr = explainer_lr.explain_instance(data_row = np.ravel(wrong),
                                       predict_fn = trained_model_lr.predict_proba)
# Create a lime explainer object for the gradient boosting model
explainer_gbm = lime.lime_tabular.LimeTabularExplainer(training_data = xtrain.values,
                                                       mode = 'classification',
                                                       training_labels = ytrain.values.ravel(),
                                                       feature_names = xtrain.columns)
# Explanation for wrong prediction
exp_gbm = explainer_gbm.explain_instance(data_row = np.ravel(wrong),
                                         predict_fn = trained_model_grb.predict_proba)
LIME Explainer for Logistic Regression
exp_lr.show_in_notebook()
LIME Explainer for GBM
exp_gbm.show_in_notebook()
def lift_plot_model(ytest, yprob):
    '''
    Objective: Function to plot Lift Chart
    Argument : ytest: actual take-up flags (1/0),
               yprob: predicted probabilities for the positive class
    Returns  : Lift table (pd.DataFrame), one row per decile
    Output   : Lift Chart
    '''
    n_bins = 10
    # reset_index(drop=True) aligns both series by position for the concat;
    # the original reset_index() leaked stray 'index' columns into the frame
    # (the commented-out drop hinted at that problem).
    actual_ser = pd.Series(ytest).rename('actuals').reset_index(drop=True)
    proba_ser = pd.Series(yprob).rename('probabilities').reset_index(drop=True)
    # Join actuals and probabilities side by side
    lift_table = pd.concat([actual_ser, proba_ser], axis=1).fillna(0)
    actual_col = 'actuals'
    probability_col = 'probabilities'
    # Rank cases from most to least likely so bins reflect model confidence
    lift_table.sort_values(by=probability_col, ascending=False, inplace=True)
    rows = []
    # Split the data into the number of bins desired.
    for group in np.array_split(lift_table, n_bins):
        score = group[(group[actual_col] == 1)][actual_col].sum()
        rows.append({'NumCases': len(group), 'NumCorrectPredictions': score})
    lift = pd.DataFrame(rows)
    #Cumulative Gains Calculation
    lift['RunningCompleted'] = lift['NumCases'].cumsum() - lift['NumCases']
    lift['PercentCorrect'] = lift['NumCorrectPredictions'].cumsum() / \
        lift['NumCorrectPredictions'].sum() * 100
    # Expected positives per bin under random targeting (no model)
    lift['AvgCase'] = lift['NumCorrectPredictions'].sum() / len(lift)
    lift['CumulativeAvgCase'] = lift['AvgCase'].cumsum()
    #Lift Chart
    lift['LiftLine'] = 1
    # Lift = positives captured in the bin relative to the random expectation
    lift['Lift'] = lift['NumCorrectPredictions'] / lift['AvgCase']
    plt.plot(lift['Lift'], label= 'Response rate for model')
    plt.plot(lift['LiftLine'], 'r-', label='Normalised \'response rate\' with no model')
    plt.xlabel(str(100/len(lift)) + '% Increments')
    plt.ylabel('Lift')
    plt.legend()
    plt.title("Lift Chart")
    plt.show()
    return lift
def plot_roc(ytest_roc,yprob_roc):
    '''
    Objective: Function to plot ROC Graph
    Argument : ytest_roc: Actual Take up rate(1/0), yprob_roc: predicted
               probabilities for the positive class
    Returns  : None
    Output   : ROC Plot
    '''
    fig = plt.figure(1, figsize=(6, 6))
    false_positive_rate, true_positive_rate, thresholds = \
        roc_curve(ytest_roc, yprob_roc)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Fixed displayed-title typo: "Receiving" -> "Receiver"
    plt.title("Receiver Operating Characteristic")
    plt.plot(false_positive_rate, true_positive_rate, 'b', \
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal = performance of a random classifier
    plt.plot([0,1], [0,1], 'r--')
    # Slightly padded limits so the curve endpoints are not clipped
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    plt.tight_layout()
    # Removed a stray `plt.figure(2, ...)` call that opened a second, empty
    # figure (the unused `nfig`) every time this function ran.
    plt.show()
def evaluate_model(model_name, trained_model, xtrain, xtest, ytrain, ytest, verbose = False, threshold = 0.5):
    '''
    Objective: Function to calculate and return key model performance metrics
    Arguments: 8 arguments
    1) model_name: Name of the model
    2) trained_model: Trained model (must implement predict_proba)
    3) xtrain: Training data set for features
    4) xtest: testing dataset for features
    5) ytrain: Training data set for target
    6) ytest: testing dataset for target
    7) verbose: print key performance metrics if True (default False)
    8) threshold: Decision threshold used to classify the predicted probabilities
    Returns : pd.DataFrame containing all key performance metrics
    Output : pd.DataFrame containing all key performance metrics, ROC plot, Lift plot
    '''
    #Predict using trained model for training and test datasets (with and without probabilities)
    prob_test = trained_model.predict_proba(xtest)
    prob_train = trained_model.predict_proba(xtrain)
    # Apply the threshold manually rather than via predict() so non-default
    # cut-offs (e.g. 0.3) can be evaluated with the same code path
    pred_test = (prob_test [:,1] >= threshold).astype('int')
    pred_train = (prob_train [:,1] >= threshold).astype('int')
    #Calculate AUC (threshold-independent, uses positive-class probabilities)
    auc_score = roc_auc_score(ytest, prob_test[:,1])
    #Calculate train and test accuracy
    train_acc = accuracy_score(ytrain.values.ravel(), pred_train)
    test_acc = accuracy_score(ytest.values.ravel(), pred_test)
    #Calculate log loss value
    log_loss_value = log_loss(ytest, prob_test[:,1],eps=1e-15, normalize=True)
    #Generate confusion matrix
    conf_matrix = confusion_matrix(ytest.values.ravel(), pred_test)
    #Calculate classification model evaluation metrics like precision, recall, f1 score
    report = classification_report(ytest, pred_test)
    # Per-class arrays; index [1] used below is the positive (churn) class
    precision,recall,fscore,support=precision_recall_fscore_support(ytest,pred_test)
    print ("Lift plot for validation Sample")
    lift_table = lift_plot_model(ytest.values.ravel(), prob_test[:,1])
    print ("ROC curve for the validaton Sample")
    plot_roc(ytest.values.ravel(), prob_test[:,1])
    #Collate all key performance metrics into a dataframe
    model_evaluation_metrics = pd.DataFrame({'Model': [model_name], 'AUC': [auc_score], 'Test Accuracy': [test_acc]
    , 'Recall_1': [recall[1]], 'Precision_1': [precision[1]], 'F1 Score_1': [fscore[1]]
    , 'Log loss': [log_loss_value]})
    model_evaluation_metrics.columns
    # Fix the column order for display
    model_evaluation_metrics = model_evaluation_metrics[['Model', 'AUC', 'Test Accuracy',
    'Recall_1', 'Precision_1', 'F1 Score_1','Log loss']]
    #Get lifts in top n deciles, where n is defined below
    n=2
    lift_table.reset_index(inplace = True)
    # Make decile numbering 1-based for the column labels (Decile_1, Decile_2, ...)
    lift_table['index'] = lift_table['index'] + 1
    lift_table['Decile'] = lift_table['index'].apply(lambda x: 'Decile_' + str(x) + ' Lift %')
    # Transpose the top-n rows so each decile's lift becomes its own column
    top_decile_lifts = lift_table[0:n][['Decile','Lift']].T
    top_decile_lifts.columns = top_decile_lifts.iloc[0]
    top_decile_lifts = top_decile_lifts.reindex(top_decile_lifts.index.drop('Decile'))
    top_decile_lifts.reset_index(drop=True,inplace=True)
    #Add lift values for top deciles to the model key performance metrics
    model_evaluation_metrics = pd.concat([model_evaluation_metrics,top_decile_lifts],axis=1)
    #Print key performance metrics if verbose = True is passed as an argument
    if verbose == True:
        print ("Trained model :: ", trained_model)
        print ("\n\nModel ROC-AUC score for validation sample: %.3f" \
        % auc_score)
        print ("\n\nTrain Accuracy :: ", train_acc)
        print ("\n\nTest Accuracy :: ", test_acc)
        print ("\n\nLog Loss Without for validation sample:", \
        log_loss_value)
        print ("\n\n Confusion matrix \n")
        skplt.metrics.plot_confusion_matrix(ytest.values.ravel(), pred_test, title="Confusion Matrix",
        figsize=(4,4),text_fontsize='large')
        plt.show()
        print("\n\n Classification report (weighted average across classes) ::\n", classification_report(ytest, pred_test))
    return model_evaluation_metrics
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    '''
    Objective: To plot Validation Curve for GridSearchCV parameter tuning results
    Arguments: 5 arguments
    1) cv_results: Cross validation results from tuning (GridSearchCV.cv_results_)
    2) grid_param_1: List of parameter 1 values used for tuning (x-axis)
    3) grid_param_2: List of parameter 2 values used for tuning (one curve per value)
    4) name_param_1: Parameter 1 name
    5) name_param_2: Parameter 2 name
    Output : Validation Curve plot with both parameters and CV results
    '''
    # Get Test Scores Mean and std for each grid search
    # NOTE(review): the reshape assumes cv_results_ iterates param 2 slowest,
    # i.e. param 2's grid key sorts before param 1's — true for the
    # ('max_depth', 'n_estimators') grid used in this notebook; verify before
    # reusing with other parameter grids.
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))
    # Standard deviations are computed but not currently plotted
    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))
    # Plot Grid search scores
    _, ax = plt.subplots(1,1)
    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))
    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="lower right", fontsize=8)
    ax.grid('on')